---
title: "Crowdsourced Adaptive Surveys Replication"
author: "Yamil Velez"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
# Chunk defaults for the whole document: echo the code, draw 10 x 8 figures,
# and keep warnings and messages out of the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  fig.width = 10,
  fig.height = 8,
  warning = FALSE,
  error = FALSE,
  message = FALSE
)
```

## Figure 2
```{r fig2}
library(qualtRics)
library(magrittr)
library(tidyverse)
library(hrbrthemes)
library(modelsummary)

# Read the issue-importance survey export and keep only rows whose Progress
# column equals 100.
survey <- filter(
  qualtRics::read_survey("cr_issues.csv"),
  Progress == 100
)

# Add two derived columns to `survey`:
#   pid7 - a seven-point score built from pid3, strong and pid_lean; rows that
#          match none of the conditions below are left as NA
#   age  - 2023 minus the reported birth year (birthyr_e)
survey <- survey %>%
  mutate(
    pid7 = case_when(
      pid3 == 1 & strong == 2 ~ 7,
      pid3 == 1 & strong == 4 ~ 6,
      pid3 > 2 & pid_lean == 2 ~ 5,
      pid3 > 2 & pid_lean == 3 ~ 4,
      pid3 > 2 & pid_lean == 1 ~ 3,
      pid3 == 2 & strong == 4 ~ 2,
      pid3 == 2 & strong == 2 ~ 1
    ),
    age = 2023 - birthyr_e
  )


# Figure 2: inverse-probability-weighted mean importance per issue with 95%
# intervals, ordered by mean importance.
#
# Three long tables are bound column-wise, so they must line up row for row:
#   1. issue text   - personal_issue plus the q<k>_2 columns (name, value)
#   2. ratings      - the issue_importance columns (issue, importance)
#   3. probabilities - a constant 1 for the personal_issue slot (p0_2) plus
#                      the p<k>_2 columns (ipw)
# NOTE(review): this assumes the three column sets have the same number of
# columns in the same order -- confirm against the survey export.
survey %>%
  select(personal_issue, matches("q\\d+_2")) %>%
  pivot_longer(cols = everything()) %>%
  bind_cols(
    survey %>%
      select(contains("issue_importance")) %>%
      pivot_longer(
        cols = everything(),
        names_to = "issue",
        values_to = "importance"
      )
  ) %>%
  bind_cols(
    bind_cols(tibble(p0_2 = 1), survey %>% select(matches("p\\d+_2"))) %>%
      pivot_longer(everything(), values_to = "ipw") %>%
      select(-name)
  ) %>%
  # Only the q<k>_2 issues are plotted; the respondent's own issue is dropped.
  filter(name != "personal_issue") %>%
  group_by(value) %>%
  summarize(
    mean_importance = weighted.mean(importance, w = 1 / ipw, na.rm = TRUE),
    # Standard error of the mean: sd / sqrt(n). The earlier sqrt(sd / n) was
    # not a standard error and disagreed with the formula used for Figure B1.
    se_importance = sd(importance, na.rm = TRUE) / sqrt(n()),
    n = n()
  ) %>%
  # Keep issues with at least 50 ratings and a non-degenerate interval.
  filter(n >= 50) %>%
  na.omit() %>%
  filter(se_importance != 0) %>%
  ggplot(aes(y = forcats::fct_reorder(value, mean_importance),
             x = mean_importance,
             xmin = mean_importance - 1.96 * se_importance,
             xmax = mean_importance + 1.96 * se_importance)) +
  # geom_point has no `label` aesthetic, so the unused mapping was removed.
  geom_point() +
  geom_pointrange() +
  theme_ipsum_rc() +
  theme(axis.line.y = element_blank(),
        axis.text.x = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        panel.grid.minor = element_blank(),
        legend.position = "bottom",
        plot.margin = margin(t = 20, r = 20, b = 20, l = 20)) +
  labs(y = "",
       x = "Mean Importance")
```

## Figure 3
```{r fig3}
# Read the misinformation survey export and keep only rows whose Progress
# column equals 100.
survey2 <- filter(
  qualtRics::read_survey("cr_misinfo.csv"),
  Progress == 100
)

# Figure 3: inverse-probability-weighted mean accuracy rating per claim with
# 95% intervals; claims flagged as seed_question get a different grey.

# Phrases that mark a claim as a seed_question. The pattern is assembled with
# paste() because a regex literal broken across source lines keeps the line
# breaks and indentation inside the alternatives, which then never match.
seed_claim_pattern <- paste(
  c(
    "This year, there was",
    "Joe Biden authorized",
    "There were no",
    "Trump will deport"
  ),
  collapse = "|"
)

# Stack the four (question, rating, probability) slots into one long table.
bind_rows(
  survey2 %>%
    transmute(question = q1, rating = pre_beliefs_9, probability = p1),
  survey2 %>%
    transmute(question = q2, rating = pre_beliefs_10, probability = p2),
  survey2 %>%
    transmute(question = q3, rating = pre_beliefs_11, probability = p3),
  survey2 %>%
    transmute(question = q4, rating = pre_beliefs_12, probability = p4)
) %>%
  na.omit() %>%
  mutate(
    # Keep the text up to and including the first period.
    question = str_extract(question, "^.*?\\."),
    # Flag before wrapping so inserted line breaks cannot split a phrase.
    seed_question = as.numeric(grepl(seed_claim_pattern, question)),
    question = str_wrap(question, 80)
  ) %>%
  group_by(question) %>%
  dplyr::summarize(
    mean_rating = weighted.mean(rating, w = 1 / probability, na.rm = TRUE),
    sd_rating = sd(rating, na.rm = TRUE),
    se = sd_rating / sqrt(n()),
    n_size = n(),
    seed_question = as.factor(max(seed_question))
  ) %>%
  na.omit() %>%
  # Keep claims with at least 10 ratings and a non-degenerate interval.
  filter(n_size >= 10 & se != 0) %>%
  arrange(-mean_rating) %>%
  ggplot(aes(y = forcats::fct_reorder(question, mean_rating),
             x = mean_rating,
             color = seed_question)) +
  geom_point(size = 5) +
  geom_linerange(aes(xmin = mean_rating - 1.96 * se,
                     xmax = mean_rating + 1.96 * se),
                 size = .9) +
  theme_ipsum_rc() +
  scale_color_grey(start = .1, end = .6) +
  theme(axis.line.y = element_blank(),
        axis.text.y = element_text(size = 10,
                                   lineheight = .7,
                                   hjust = .5),
        axis.text.x = element_text(size = 10),
        axis.title.y.right = element_text(size = 10),
        axis.title.x = element_text(size = 10),
        plot.margin = margin(10, 10, 10, 10, "mm"),
        legend.position = "none") +
  scale_y_discrete(position = "right") +
  labs(y = "", x = "Accuracy Rating",
       size = "Number of Ratings",
       title = "Negative Claims about Parties and Candidates")
```

## Figure B1
```{r fig b1}
# Figure B1: weighted mean issue importance for party group 1 vs group 2,
# one row per issue, ordered by the absolute gap between the two groups.
# The connecting line is solid when the gap's z statistic is >= 1.96 and
# dashed otherwise.
#
# NOTE(review): the two leaner conditions test `pid3 == 1`, whereas pid7
# earlier in this file keys pid_lean on `pid3 > 2`. If pid_lean is only
# recorded for pid3 > 2 these two conditions never fire -- confirm intent.
# NOTE(review): group 1 (pid3 == 1) is drawn as "Democrat" here, while pid7
# assigns pid3 == 1 the values 6-7 -- confirm the coding direction.
survey %>%
  mutate(
    pid_cat = case_when(
      !is.na(strong) & pid3 == 1 & strong == 2 ~ 1,
      !is.na(strong) & pid3 == 1 & strong == 4 ~ 1,
      !is.na(strong) & pid3 == 2 & strong == 2 ~ 2,
      !is.na(strong) & pid3 == 2 & strong == 4 ~ 2,
      pid3 == 1 & pid_lean == 2 ~ 1,
      pid3 == 1 & pid_lean == 1 ~ 2,
      pid3 == 3 & pid_lean == 3 ~ 3,
      TRUE ~ as.numeric(NA)
    )
  ) %>%
  select(personal_issue, pid_cat, matches("q\\d+_2")) %>%
  pivot_longer(cols = -pid_cat) %>%
  # The issue text, ratings and probabilities are bound column-wise, so the
  # three long tables must line up row for row.
  bind_cols(
    survey %>%
      select(contains("issue_importance")) %>%
      pivot_longer(
        cols = everything(),
        names_to = "issue",
        values_to = "importance"
      )
  ) %>%
  bind_cols(
    bind_cols(tibble(p0_2 = 1), survey %>% select(matches("p\\d+_2"))) %>%
      pivot_longer(everything(), values_to = "ipw") %>%
      select(-name)
  ) %>%
  filter(name != "personal_issue") %>%
  group_by(value, pid_cat) %>%
  summarize(
    mean_importance = weighted.mean(importance, w = 1 / ipw, na.rm = TRUE),
    se_importance = sqrt(var(importance, na.rm = TRUE) / length(importance)),
    n = n(),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = pid_cat,
              values_from = c(mean_importance, se_importance, n),
              names_glue = "{.value}_{pid_cat}") %>%
  mutate(
    plot_priority = abs(mean_importance_1 - mean_importance_2),
    # Both groups need at least 12 ratings. A logical AND replaces the earlier
    # division of the two 0/1 flags; the rows kept by the filter are the same.
    min_n = as.numeric(n_1 >= 12 & n_2 >= 12),
    avg_n = (n_1 + n_2) / 2
  ) %>%
  filter(min_n == 1) %>%
  # NOTE(review): na.omit() also drops issues with an NA in any other pid_cat
  # column created by the pivot (e.g. group 3) -- confirm that is intended.
  na.omit() %>%
  mutate(
    diff_flag = as.factor(
      abs(mean_importance_1 - mean_importance_2) /
        sqrt(se_importance_1^2 + se_importance_2^2) >= 1.96
    )
  ) %>%
  ggplot(aes(x = mean_importance_1,
             xend = mean_importance_2,
             linetype = diff_flag,
             y = forcats::fct_reorder(value, plot_priority))) +
  geom_linerange(aes(xmin = mean_importance_1,
                     xmax = mean_importance_2)) +
  geom_point(aes(x = mean_importance_2, color = "Republican"),
             size = 3) +
  geom_point(aes(x = mean_importance_1, color = "Democrat"),
             size = 3) +
  scale_linetype_manual(values = c("dashed", "solid")) +
  scale_color_manual(values = c("skyblue", "salmon")) +
  theme_ipsum_rc() +
  labs(x = "Mean Importance", y = "Issue") +
  theme(panel.grid.minor = element_blank(),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        legend.position = "none")
```

```{r fig b2}
# Figure B2: weighted mean accuracy rating per claim for the "Trust" and
# "Distrust" halves of the sample, ordered by the absolute gap between them.

# Split respondents into two ntile bins on 6 - news_trust_2; bin 1 (the
# smallest values of 6 - news_trust_2) is labelled "Trust", bin 2 "Distrust".
survey2 <- survey2 %>%
  filter(Progress == 100) %>%
  mutate(
    trust_cat = factor(
      ntile(6 - news_trust_2, 2),
      levels = c(1, 2),
      labels = c("Trust", "Distrust")
    )
  )

# Stack the four (question, rating, probability) slots, drop incomplete rows,
# shorten each claim to its first sentence, and summarise per claim and group.
survey2_long <- list(
  transmute(survey2, question = q1, rating = pre_beliefs_9,
            probability = p1, trust_cat),
  transmute(survey2, question = q2, rating = pre_beliefs_10,
            probability = p2, trust_cat),
  transmute(survey2, question = q3, rating = pre_beliefs_11,
            probability = p3, trust_cat),
  transmute(survey2, question = q4, rating = pre_beliefs_12,
            probability = p4, trust_cat)
) %>%
  bind_rows() %>%
  na.omit() %>%
  mutate(question = str_extract(question, "^.*?\\.")) %>%
  group_by(question, trust_cat) %>%
  summarize(
    mean_rating = weighted.mean(rating, w = 1/probability, na.rm = TRUE),
    se_rating = sd(rating, na.rm = TRUE) / sqrt(n()),
    n_size = n(),
    .groups = 'drop'
  )

# One row per claim with Trust / Distrust columns side by side. Claims need at
# least 5 ratings in each group; diff_flag marks gaps with |z| >= 1.96.
survey2_wide <- survey2_long %>%
  pivot_wider(
    names_from = trust_cat,
    values_from = c(mean_rating, se_rating, n_size),
    names_glue = "{.value}_{trust_cat}"
  ) %>%
  mutate(
    plot_priority = abs(mean_rating_Distrust - mean_rating_Trust),
    min_n = as.numeric(n_size_Distrust >= 5 & n_size_Trust >= 5),
    avg_n = (n_size_Distrust + n_size_Trust) / 2
  ) %>%
  filter(min_n == 1) %>%
  na.omit() %>%
  mutate(
    diff_flag = as.factor(
      abs(mean_rating_Distrust - mean_rating_Trust) /
        sqrt(se_rating_Distrust^2 + se_rating_Trust^2) >= 1.96
    )
  )

ggplot(
  survey2_wide,
  aes(
    x = mean_rating_Distrust,
    xend = mean_rating_Trust,
    linetype = diff_flag,
    y = forcats::fct_reorder(question, plot_priority)
  )
) +
  geom_linerange(
    aes(xmin = mean_rating_Distrust, xmax = mean_rating_Trust),
    size = 1
  ) +
  geom_point(aes(x = mean_rating_Distrust, color = "Distrust"), size = 3) +
  geom_point(aes(x = mean_rating_Trust, color = "Trust"), size = 3) +
  scale_linetype_manual(values = c("dotted", "solid")) +
  scale_color_manual(values = c("salmon", "darkgreen")) +
  theme_ipsum_rc() +
  labs(x = "Mean Rating", y = "Question") +
  theme(
    panel.grid.minor = element_blank(),
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    legend.position = "none"
  )

```

```{r fig e1}
library(rio)

# Figure E1: counts of each coding category in the two Mistral comparison
# files, one facet per study.
mistral_issues <- import("open_source_comparison/mistral_analysis.csv")
mistral_claims <- import("open_source_comparison/mistral_analysis2.csv")

bind_rows(
  mistral_issues %>% mutate(study = "Most Important Issue Study"),
  mistral_claims %>% mutate(study = "Misinformation Study")
) %>%
  # Relabel the raw codes for display.
  mutate(
    coding = plyr::mapvalues(
      coding,
      from = c("different", "same", "similar"),
      to = c("Conflicting", "Identical", "Comparable")
    )
  ) %>%
  ggplot(aes(x = coding)) +
  geom_bar() +
  facet_wrap(. ~ study, nrow = 2) +
  theme_ipsum_rc() +
  labs(y = "Count", x = "Coding") +
  theme(
    axis.text.x = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15)
  )

```

```{r fig e2}

# Figure E2: counts of each coding category in the two Llama comparison files,
# one facet per study. The claims file contributes two code columns
# (Rep_Comparison and Dem_Comparison), stacked under the same study label.
llama_issues <- import("open_source_comparison/llama_analysis.csv")
llama_claims <- import("open_source_comparison/llama_analysis2.csv")

bind_rows(
  llama_issues %>%
    transmute(study = "Most Important Issue Study", Codes),
  llama_claims %>%
    transmute(study = "Misinformation Study", Codes = Rep_Comparison),
  llama_claims %>%
    transmute(study = "Misinformation Study", Codes = Dem_Comparison)
) %>%
  # Relabel the numeric codes for display, then drop rows with missing values.
  mutate(
    coding = plyr::mapvalues(
      Codes,
      from = c(3, 1, 2),
      to = c("Conflicting", "Identical", "Comparable")
    )
  ) %>%
  na.omit() %>%
  ggplot(aes(x = coding)) +
  geom_bar() +
  facet_wrap(. ~ study, nrow = 2) +
  theme_ipsum_rc() +
  labs(y = "Count", x = "Coding") +
  theme(
    axis.text.x = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.y = element_text(size = 15)
  )


```

```{r fig h1}

# One row per threshold; the columns read below are threshold,
# unique_neg_rep_claims and unique_neg_dem_claims.
unique_claims_df <- read.csv(file = "embeddings/unique_claims.csv")

# Normalise free text: lower-case it, strip punctuation and digits, remove
# English stop words, stem, and collapse repeated whitespace.
#
# The text-mining helpers are namespace-qualified because this file never
# attaches tm or SnowballC; unqualified, the function would fail when called.
# It is not called anywhere in the visible part of this file.
# NOTE(review): wordStem() receives the whole string rather than individual
# words -- confirm this is the intended stemming.
#
# text: character vector. Returns a character vector of the same length.
preprocess_text <- function(text) {
  text <- tolower(text)
  text <- tm::removePunctuation(text)
  text <- tm::removeNumbers(text)
  text <- tm::removeWords(text, tm::stopwords("en"))
  text <- SnowballC::wordStem(text, language = "en")
  tm::stripWhitespace(text)
}

# Count the distinct claims in each element of `claims_list`.
#
# Each element is coerced to character and split on ", "; the number of
# distinct pieces is returned.
#
# claims_list: vector whose elements are ", "-separated claim strings.
# Returns an unnamed integer vector with one count per element. vapply() pins
# the return type, so empty input gives integer(0) rather than list().
count_unique_claims <- function(claims_list) {
  vapply(
    claims_list,
    function(claims) {
      length(unique(unlist(strsplit(as.character(claims), ", "))))
    },
    integer(1),
    USE.NAMES = FALSE
  )
}

# Figure H1: number of distinct claims at each threshold, one line per
# claim category.
unique_neg_rep_counts <- count_unique_claims(
  unique_claims_df$unique_neg_rep_claims
)
unique_neg_dem_counts <- count_unique_claims(
  unique_claims_df$unique_neg_dem_claims
)

# Long layout: the thresholds are repeated once per category, Republican
# counts first and Democrat counts second.
data_for_plot_claims <- data.frame(
  Threshold = rep(unique_claims_df$threshold, 2),
  Count = c(unique_neg_rep_counts, unique_neg_dem_counts),
  Category = rep(
    c("Negative Republican Claims", "Negative Democrat Claims"),
    each = length(unique_claims_df$threshold)
  )
)

ggplot(
  data_for_plot_claims,
  aes(x = Threshold, y = Count, color = Category)
) +
  geom_point() +
  geom_line() +
  labs(
    title = "Number of Unique Claims by Threshold",
    x = "Cosine Similarity Threshold",
    y = "Number of Unique Claims"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

```

```{r fig h2}
# One row per threshold; the columns read below are threshold and
# unique_personal_issues.
unique_issues_df <- read.csv(file = "embeddings/unique_issues.csv")

# Count the distinct issues in each element of `issues_list`.
#
# Each element is coerced to character and split on ", "; the number of
# distinct pieces is returned.
#
# issues_list: vector whose elements are ", "-separated issue strings.
# Returns an unnamed integer vector with one count per element. vapply() pins
# the return type, so empty input gives integer(0) rather than list().
count_unique_issues <- function(issues_list) {
  vapply(
    issues_list,
    function(issues) {
      length(unique(unlist(strsplit(as.character(issues), ", "))))
    },
    integer(1),
    USE.NAMES = FALSE
  )
}

# Figure H2: number of distinct personal issues at each threshold, drawn as
# bars.
unique_personal_issues_counts <- count_unique_issues(
  unique_issues_df$unique_personal_issues
)

data_for_plot_issues <- data.frame(
  Threshold = unique_issues_df$threshold,
  Count = unique_personal_issues_counts
)

ggplot(data_for_plot_issues, aes(x = Threshold, y = Count)) +
  # geom_col() draws bar heights straight from Count.
  geom_col(fill = "blue", alpha = 0.7) +
  labs(
    title = "Number of Unique Issues by Threshold",
    x = "Cosine Similarity Threshold",
    y = "Number of Unique Issues"
  ) +
  theme_minimal()

```

```{r table i1}

# Table I1: regress survey start time (rescaled to [0, 1]) on respondent
# covariates.

# Seconds between each StartDate and a fixed reference time, then min-max
# scaled so the earliest start is 0 and the latest is 1. The reference time
# cancels out in the rescaling.
survey$time <- with(survey, {
  start_secs <- as.numeric(difftime(
    as.POSIXct(StartDate, format = "%m/%d/%y %H:%M"),
    as.POSIXct("2023-07-06 18:49:00", format = "%Y-%m-%d %H:%M:%S"),
    units = "secs"
  ))
  secs_range <- range(start_secs, na.rm = TRUE)
  (start_secs - secs_range[1]) / (secs_range[2] - secs_range[1])
})

# Recode race_1 to 1 when a value is present and 0 when it is missing.
# This overwrites the column, so re-running the chunk on the already recoded
# data sets every row to 1.
survey$race_1 <- as.numeric(!is.na(survey$race_1))

# `output` is spelled out in full; `out` only worked through partial argument
# matching.
modelsummary(
  lm(
    time ~ gender + pid7 + education + income + ideology + race_1 + age,
    survey
  ),
  stars = TRUE,
  output = "latex"
)
```

```{r fig k1}
# Figure K1: inverse-probability-weighted mean rating per question in the
# local survey file, with 95% intervals, ordered by mean rating.
survey3 <- rio::import("cr_local.csv")

survey3 %>%
  group_by(question) %>%
  summarize(
    m_rating = weighted.mean(rating, 1 / probability, na.rm = TRUE),
    se = sd(rating, na.rm = TRUE) / sqrt(n()),
    ratings = n()
  ) %>%
  ungroup() %>%
  # Drop questions whose interval would have zero width.
  filter(se != 0) %>%
  ggplot(
    aes(
      y = forcats::fct_reorder(question, m_rating),
      x = m_rating,
      xmin = m_rating - 1.96 * se,
      xmax = m_rating + 1.96 * se
    )
  ) +
  geom_point(size = 5, position = position_dodge(.5)) +
  geom_linerange(
    aes(xmin = m_rating - 1.96 * se, xmax = m_rating + 1.96 * se),
    size = .9,
    position = position_dodge(.5)
  ) +
  theme_ipsum_rc() +
  scale_color_grey(start = .1, end = .6) +
  theme(
    axis.line.y = element_blank(),
    # Compact, centred labels for the question text.
    axis.text.y = element_text(size = 10, lineheight = .7, hjust = .5),
    axis.text.x = element_text(size = 10),
    axis.title.y.right = element_text(size = 10),
    axis.title.x = element_text(size = 10),
    plot.margin = margin(10, 10, 10, 10, "mm")
  ) +
  scale_y_discrete(position = "right") +
  labs(
    y = "",
    x = "Importance Rating",
    title = "Local Issue Priorities"
  )
```